<?php

/*
 * Copyright (C) 2009 - 2011 Pham Cong Dinh
 *
 * This file is part of Spica.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 3 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */

/**
 * Methods to handle character encoding issues.
 *
 * @category   spica
 * @package    core
 * @subpackage utils
 * @author     Pham Cong Dinh <pcdinh at phpvietnam dot net>
 * @since      Version 0.3
 * @since      March 13, 2010
 * @copyright  Pham Cong Dinh (http://www.phpvietnam.net)
 * @license    http://www.gnu.org/licenses/lgpl-3.0.txt
 * @version    $Id: CharacterUtils.php 1869 2011-01-07 18:55:25Z pcdinh $
 */
class SpicaCharacterUtils
{
    /**
     * Map from named HTML entities (as defined by XHTML 1.0) to their UTF-8 byte sequences.
     *
     * The five markup-significant entities are deliberately absent:
     * lt (<), gt (>), amp (&), quot ("), and apos (').
     *
     * Every byte is spelled with an explicit \xNN escape. PHP has no "\80"-style escape,
     * so a sequence written without the "x" would stay in the string as literal text.
     *
     * @var array
     */
    public static $htmlEntToUtf8 = array(
        '&nbsp;' => "\xC2\xA0", '&iexcl;' => "\xC2\xA1", '&cent;' => "\xC2\xA2",
        '&pound;' => "\xC2\xA3", '&curren;' => "\xC2\xA4", '&yen;' => "\xC2\xA5",
        '&brvbar;' => "\xC2\xA6", '&sect;' => "\xC2\xA7", '&uml;' => "\xC2\xA8",
        '&copy;' => "\xC2\xA9", '&ordf;' => "\xC2\xAA", '&laquo;' => "\xC2\xAB",
        '&not;' => "\xC2\xAC", '&shy;' => "\xC2\xAD", '&reg;' => "\xC2\xAE",
        '&macr;' => "\xC2\xAF", '&deg;' => "\xC2\xB0", '&plusmn;' => "\xC2\xB1",
        '&sup2;' => "\xC2\xB2", '&sup3;' => "\xC2\xB3", '&acute;' => "\xC2\xB4",
        '&micro;' => "\xC2\xB5", '&para;' => "\xC2\xB6", '&middot;' => "\xC2\xB7",
        '&cedil;' => "\xC2\xB8", '&sup1;' => "\xC2\xB9", '&ordm;' => "\xC2\xBA",
        '&raquo;' => "\xC2\xBB", '&frac14;' => "\xC2\xBC", '&frac12;' => "\xC2\xBD",
        '&frac34;' => "\xC2\xBE", '&iquest;' => "\xC2\xBF", '&Agrave;' => "\xC3\x80",
        '&Aacute;' => "\xC3\x81", '&Acirc;' => "\xC3\x82", '&Atilde;' => "\xC3\x83",
        '&Auml;' => "\xC3\x84", '&Aring;' => "\xC3\x85", '&AElig;' => "\xC3\x86",
        '&Ccedil;' => "\xC3\x87", '&Egrave;' => "\xC3\x88", '&Eacute;' => "\xC3\x89",
        '&Ecirc;' => "\xC3\x8A", '&Euml;' => "\xC3\x8B", '&Igrave;' => "\xC3\x8C",
        '&Iacute;' => "\xC3\x8D", '&Icirc;' => "\xC3\x8E", '&Iuml;' => "\xC3\x8F",
        '&ETH;' => "\xC3\x90", '&Ntilde;' => "\xC3\x91", '&Ograve;' => "\xC3\x92",
        '&Oacute;' => "\xC3\x93", '&Ocirc;' => "\xC3\x94", '&Otilde;' => "\xC3\x95",
        '&Ouml;' => "\xC3\x96", '&times;' => "\xC3\x97", '&Oslash;' => "\xC3\x98",
        '&Ugrave;' => "\xC3\x99", '&Uacute;' => "\xC3\x9A", '&Ucirc;' => "\xC3\x9B",
        '&Uuml;' => "\xC3\x9C", '&Yacute;' => "\xC3\x9D", '&THORN;' => "\xC3\x9E",
        '&szlig;' => "\xC3\x9F", '&agrave;' => "\xC3\xA0", '&aacute;' => "\xC3\xA1",
        '&acirc;' => "\xC3\xA2", '&atilde;' => "\xC3\xA3", '&auml;' => "\xC3\xA4",
        '&aring;' => "\xC3\xA5", '&aelig;' => "\xC3\xA6", '&ccedil;' => "\xC3\xA7",
        '&egrave;' => "\xC3\xA8", '&eacute;' => "\xC3\xA9", '&ecirc;' => "\xC3\xAA",
        '&euml;' => "\xC3\xAB", '&igrave;' => "\xC3\xAC", '&iacute;' => "\xC3\xAD",
        '&icirc;' => "\xC3\xAE", '&iuml;' => "\xC3\xAF", '&eth;' => "\xC3\xB0",
        '&ntilde;' => "\xC3\xB1", '&ograve;' => "\xC3\xB2", '&oacute;' => "\xC3\xB3",
        '&ocirc;' => "\xC3\xB4", '&otilde;' => "\xC3\xB5", '&ouml;' => "\xC3\xB6",
        '&divide;' => "\xC3\xB7", '&oslash;' => "\xC3\xB8", '&ugrave;' => "\xC3\xB9",
        '&uacute;' => "\xC3\xBA", '&ucirc;' => "\xC3\xBB", '&uuml;' => "\xC3\xBC",
        '&yacute;' => "\xC3\xBD", '&thorn;' => "\xC3\xBE", '&yuml;' => "\xC3\xBF",
        '&OElig;' => "\xC5\x92", '&oelig;' => "\xC5\x93",
        '&Scaron;' => "\xC5\xA0", '&scaron;' => "\xC5\xA1", '&Yuml;' => "\xC5\xB8",
        '&circ;' => "\xCB\x86", '&tilde;' => "\xCB\x9C", '&ensp;' => "\xE2\x80\x82",
        '&emsp;' => "\xE2\x80\x83", '&thinsp;' => "\xE2\x80\x89", '&zwnj;' => "\xE2\x80\x8C",
        '&zwj;' => "\xE2\x80\x8D", '&lrm;' => "\xE2\x80\x8E", '&rlm;' => "\xE2\x80\x8F",
        '&ndash;' => "\xE2\x80\x93", '&mdash;' => "\xE2\x80\x94", '&lsquo;' => "\xE2\x80\x98",
        '&rsquo;' => "\xE2\x80\x99", '&sbquo;' => "\xE2\x80\x9A", '&ldquo;' => "\xE2\x80\x9C",
        '&rdquo;' => "\xE2\x80\x9D", '&bdquo;' => "\xE2\x80\x9E", '&dagger;' => "\xE2\x80\xA0",
        '&Dagger;' => "\xE2\x80\xA1", '&permil;' => "\xE2\x80\xB0", '&lsaquo;' => "\xE2\x80\xB9",
        '&rsaquo;' => "\xE2\x80\xBA", '&euro;' => "\xE2\x82\xAC", '&fnof;' => "\xC6\x92",
        '&Alpha;' => "\xCE\x91", '&Beta;' => "\xCE\x92", '&Gamma;' => "\xCE\x93",
        '&Delta;' => "\xCE\x94", '&Epsilon;' => "\xCE\x95", '&Zeta;' => "\xCE\x96",
        '&Eta;' => "\xCE\x97", '&Theta;' => "\xCE\x98", '&Iota;' => "\xCE\x99",
        '&Kappa;' => "\xCE\x9A", '&Lambda;' => "\xCE\x9B", '&Mu;' => "\xCE\x9C",
        '&Nu;' => "\xCE\x9D", '&Xi;' => "\xCE\x9E", '&Omicron;' => "\xCE\x9F",
        '&Pi;' => "\xCE\xA0", '&Rho;' => "\xCE\xA1", '&Sigma;' => "\xCE\xA3",
        '&Tau;' => "\xCE\xA4", '&Upsilon;' => "\xCE\xA5", '&Phi;' => "\xCE\xA6",
        '&Chi;' => "\xCE\xA7", '&Psi;' => "\xCE\xA8", '&Omega;' => "\xCE\xA9",
        '&alpha;' => "\xCE\xB1", '&beta;' => "\xCE\xB2", '&gamma;' => "\xCE\xB3",
        '&delta;' => "\xCE\xB4", '&epsilon;' => "\xCE\xB5", '&zeta;' => "\xCE\xB6",
        '&eta;' => "\xCE\xB7", '&theta;' => "\xCE\xB8", '&iota;' => "\xCE\xB9",
        '&kappa;' => "\xCE\xBA", '&lambda;' => "\xCE\xBB", '&mu;' => "\xCE\xBC",
        '&nu;' => "\xCE\xBD", '&xi;' => "\xCE\xBE", '&omicron;' => "\xCE\xBF",
        '&pi;' => "\xCF\x80", '&rho;' => "\xCF\x81", '&sigmaf;' => "\xCF\x82",
        '&sigma;' => "\xCF\x83", '&tau;' => "\xCF\x84", '&upsilon;' => "\xCF\x85",
        '&phi;' => "\xCF\x86", '&chi;' => "\xCF\x87", '&psi;' => "\xCF\x88",
        '&omega;' => "\xCF\x89", '&thetasym;' => "\xCF\x91", '&upsih;' => "\xCF\x92",
        '&piv;' => "\xCF\x96", '&bull;' => "\xE2\x80\xA2", '&hellip;' => "\xE2\x80\xA6",
        '&prime;' => "\xE2\x80\xB2", '&Prime;' => "\xE2\x80\xB3", '&oline;' => "\xE2\x80\xBE",
        '&frasl;' => "\xE2\x81\x84", '&weierp;' => "\xE2\x84\x98", '&image;' => "\xE2\x84\x91",
        '&real;' => "\xE2\x84\x9C", '&trade;' => "\xE2\x84\xA2", '&alefsym;' => "\xE2\x84\xB5",
        '&larr;' => "\xE2\x86\x90", '&uarr;' => "\xE2\x86\x91", '&rarr;' => "\xE2\x86\x92",
        '&darr;' => "\xE2\x86\x93", '&harr;' => "\xE2\x86\x94", '&crarr;' => "\xE2\x86\xB5",
        '&lArr;' => "\xE2\x87\x90", '&uArr;' => "\xE2\x87\x91", '&rArr;' => "\xE2\x87\x92",
        '&dArr;' => "\xE2\x87\x93", '&hArr;' => "\xE2\x87\x94", '&forall;' => "\xE2\x88\x80",
        '&part;' => "\xE2\x88\x82", '&exist;' => "\xE2\x88\x83", '&empty;' => "\xE2\x88\x85",
        '&nabla;' => "\xE2\x88\x87", '&isin;' => "\xE2\x88\x88", '&notin;' => "\xE2\x88\x89",
        '&ni;' => "\xE2\x88\x8B", '&prod;' => "\xE2\x88\x8F", '&sum;' => "\xE2\x88\x91",
        '&minus;' => "\xE2\x88\x92", '&lowast;' => "\xE2\x88\x97", '&radic;' => "\xE2\x88\x9A",
        '&prop;' => "\xE2\x88\x9D", '&infin;' => "\xE2\x88\x9E", '&ang;' => "\xE2\x88\xA0",
        '&and;' => "\xE2\x88\xA7", '&or;' => "\xE2\x88\xA8", '&cap;' => "\xE2\x88\xA9",
        '&cup;' => "\xE2\x88\xAA", '&int;' => "\xE2\x88\xAB", '&there4;' => "\xE2\x88\xB4",
        '&sim;' => "\xE2\x88\xBC", '&cong;' => "\xE2\x89\x85", '&asymp;' => "\xE2\x89\x88",
        '&ne;' => "\xE2\x89\xA0", '&equiv;' => "\xE2\x89\xA1", '&le;' => "\xE2\x89\xA4",
        '&ge;' => "\xE2\x89\xA5", '&sub;' => "\xE2\x8A\x82", '&sup;' => "\xE2\x8A\x83",
        '&nsub;' => "\xE2\x8A\x84", '&sube;' => "\xE2\x8A\x86", '&supe;' => "\xE2\x8A\x87",
        '&oplus;' => "\xE2\x8A\x95", '&otimes;' => "\xE2\x8A\x97", '&perp;' => "\xE2\x8A\xA5",
        '&sdot;' => "\xE2\x8B\x85", '&lceil;' => "\xE2\x8C\x88", '&rceil;' => "\xE2\x8C\x89",
        '&lfloor;' => "\xE2\x8C\x8A", '&rfloor;' => "\xE2\x8C\x8B", '&lang;' => "\xE2\x8C\xA9",
        '&rang;' => "\xE2\x8C\xAA", '&loz;' => "\xE2\x97\x8A", '&spades;' => "\xE2\x99\xA0",
        '&clubs;' => "\xE2\x99\xA3", '&hearts;' => "\xE2\x99\xA5", '&diams;' => "\xE2\x99\xA6"
    );

    /**
     * Prevents direct construction of <code>SpicaCharacterUtils</code>; every method
     * of this class is static.
     */
    protected function __construct()
    {

    }

    /**
     * Converts named HTML entities (as defined in the XHTML 1.0 spec) to UTF-8-encoded
     * characters, except: lt, gt, amp, quot, and apos, which are left untouched.
     *
     * Only the named entities listed in self::$htmlEntToUtf8 are replaced; numeric
     * character references (&#123; / &#x7B;) are not handled.
     *
     * @param  string $str String that may contain HTML entities
     * @return string The input with known entities replaced by their UTF-8 bytes
     */
    public static function htmlEntToUtf8($str)
    {
        return strtr($str, self::$htmlEntToUtf8);
    }

    /**
     * Converts ISO Latin 9 string (ISO-8859-15) to UTF-8 string.
     *
     * Based on public domain code written by Klaus A. Brunner, 2002.
     *
     * @param  string $latin9 ISO-8859-15 encoded bytes
     * @return string UTF-8 encoded string (an empty string for empty input)
     */
    public static function latin9ToUtf8($latin9)
    {
        // The eight byte values that this converter does NOT map to the Unicode
        // code point of the same number, with the UTF-8 bytes emitted instead.
        static $remapped = array(
            0xA4 => "\xE2\x82\xAC", // U+20AC euro sign
            0xA6 => "\xC5\xA0",     // U+0160 S with caron
            0xA8 => "\xC5\xA1",     // U+0161 s with caron
            0xB4 => "\xC5\xBD",     // U+017D Z with caron
            0xB8 => "\xC5\xBE",     // U+017E z with caron
            0xBC => "\xC5\x92",     // U+0152 OE ligature
            0xBD => "\xC5\x93",     // U+0153 oe ligature
            0xBE => "\xC5\xB8"      // U+0178 Y with diaeresis
        );

        $latin9 = (string) $latin9;
        $len    = strlen($latin9);
        $utf8   = ''; // must be initialised: empty input never enters the loop

        for ($i = 0; $i < $len; $i++)
        {
            $code = ord($latin9[$i]);

            if ($code <= 0x7F)
            {
                // ASCII is identical in both encodings
                $utf8 .= $latin9[$i];
            }
            elseif (isset($remapped[$code]))
            {
                $utf8 .= $remapped[$code];
            }
            else
            {
                // 0x80-0xFF maps to the code point of the same number: two-byte UTF-8
                $utf8 .= chr(0xC0 | ($code >> 6)) . chr(0x80 | ($code & 0x3F));
            }
        }

        return $utf8;
    }

    /**
     * Converts ISO Latin 1 string (ISO-8859-1) to UTF-8 string.
     *
     * Based on public domain code written by Klaus A. Brunner, 2002.
     *
     * @param  string $latin1 ISO-8859-1 encoded bytes
     * @return string UTF-8 encoded string (an empty string for empty input)
     */
    public static function latin1ToUtf8($latin1)
    {
        $latin1 = (string) $latin1;
        $len    = strlen($latin1);
        $utf8   = ''; // must be initialised: empty input never enters the loop

        for ($i = 0; $i < $len; $i++)
        {
            $code = ord($latin1[$i]);

            if ($code <= 0x7F)
            {
                $utf8 .= $latin1[$i];
            }
            else
            {
                // Every Latin 1 byte equals its Unicode code point: two-byte UTF-8
                $utf8 .= chr(0xC0 | ($code >> 6)) . chr(0x80 | ($code & 0x3F));
            }
        }

        return $utf8;
    }

    /**
     * Converts utf-8 to win-1251.
     *
     * Works on raw bytes: every two-byte sequence whose lead byte is 0xD0-0xD3 is
     * looked up in an explicit table first. Sequences with lead byte 0xD0/0xD1 that
     * are not in the table are converted arithmetically (0xD0 => second byte + 0x30,
     * 0xD1 => second byte + 0x70, wrapped to one byte). Sequences with lead byte
     * 0xD2/0xD3 that are not in the table are left unchanged, as are all other bytes.
     *
     * Implemented with preg_replace_callback() because the "e" (eval) pattern
     * modifier is no longer supported by PHP 7 and later.
     *
     * @param  string $string UTF-8 encoded string
     * @return string
     */
    public static function utf8ToCp1251($string)
    {
        static $table = array(
        "\xD0\x81" => "\xA8",
        "\xD1\x91" => "\xB8",
        "\xD0\x8E" => "\xA1",
        "\xD1\x9E" => "\xA2",
        "\xD0\x84" => "\xAA",
        "\xD0\x87" => "\xAF",
        "\xD0\x86" => "\xB2",
        "\xD1\x96" => "\xB3",
        "\xD1\x94" => "\xBA",
        "\xD1\x97" => "\xBF",
        "\xD3\x90" => "\x8C",
        "\xD3\x96" => "\x8D",
        "\xD2\xAA" => "\x8E",
        "\xD3\xB2" => "\x8F",
        "\xD3\x91" => "\x9C",
        "\xD3\x97" => "\x9D",
        "\xD2\xAB" => "\x9E",
        "\xD3\xB3" => "\x9F");

        // The lead byte range includes 0xD2-0xD3 so that the table entries that
        // start with those bytes can actually be matched.
        return preg_replace_callback(
            '#[\xD0-\xD3][\x80-\xBF]#s',
            function ($match) use ($table)
            {
                $pair = $match[0];

                if (isset($table[$pair]))
                {
                    return $table[$pair];
                }

                if ("\xD0" === $pair[0])
                {
                    return chr(ord($pair[1]) + 0x30);
                }

                if ("\xD1" === $pair[0])
                {
                    // The sum can exceed 0xFF; keep only the low byte, which is
                    // what chr() did implicitly in the eval-based version.
                    return chr((ord($pair[1]) + 0x70) & 0xFF);
                }

                // 0xD2/0xD3 sequence without a table entry: not convertible here
                return $pair;
            },
            $string
        );
    }

    /**
     * Converts win-1251 to utf-8.
     *
     * Works on raw bytes: 0xC0-0xEF becomes 0xD0 followed by (byte - 0x30),
     * 0xF0-0xFF becomes 0xD1 followed by (byte - 0x70), and 0x80-0xBF is looked up in
     * an explicit table. Bytes in 0x80-0xBF that have no table entry are removed.
     * Bytes below 0x80 are left unchanged.
     *
     * Implemented with preg_replace_callback() because the "e" (eval) pattern
     * modifier is no longer supported by PHP 7 and later.
     *
     * @param  string $string
     * @return string
     */
    public static function cp1251ToUtf8($string)
    {
        static $table = array(
        "\xA8" => "\xD0\x81",
        "\xB8" => "\xD1\x91",
        "\xA1" => "\xD0\x8E",
        "\xA2" => "\xD1\x9E",
        "\xAA" => "\xD0\x84",
        "\xAF" => "\xD0\x87",
        "\xB2" => "\xD0\x86",
        "\xB3" => "\xD1\x96",
        "\xBA" => "\xD1\x94",
        "\xBF" => "\xD1\x97",
        "\x8C" => "\xD3\x90",
        "\x8D" => "\xD3\x96",
        "\x8E" => "\xD2\xAA",
        "\x8F" => "\xD3\xB2",
        "\x9C" => "\xD3\x91",
        "\x9D" => "\xD3\x97",
        "\x9E" => "\xD2\xAB",
        "\x9F" => "\xD3\xB3");

        return preg_replace_callback(
            '#[\x80-\xFF]#s',
            function ($match) use ($table)
            {
                $byte = $match[0];
                $code = ord($byte);

                if ($code >= 0xF0)
                {
                    return "\xD1" . chr($code - 0x70);
                }

                if ($code >= 0xC0)
                {
                    return "\xD0" . chr($code - 0x30);
                }

                return isset($table[$byte]) ? $table[$byte] : '';
            },
            $string
        );
    }

    /**
     * Gets the UTF-8 encoding of a character code, written as a string of
     * backslash-x hexadecimal escapes.
     *
     * Hex digits are lowercase. Codes below 0x80 produce a single escape that is
     * not zero-padded (0x0A gives "\\xa"); every other byte is two digits.
     *
     * @param  int $int Character code in the range 0 - 0x10FFFF
     * @return string the corresponding escape text for the UTF-8 bytes
     * @throws OutOfRangeException when $int is negative or above 0x10FFFF
     * @example for 0x4E00 return "\\xe4\\xb8\\x80"
     */
    public static function hexUTF16($int)
    {
        if ($int < 0 || $int >= 0x110000)
        {
            throw new OutOfRangeException("UTF16 accept character codes under 0x10FFFF");
        }

        if ($int < 0x80)
        {
            return '\\x' . dechex($int);
        }

        // Encode once, then print every resulting byte as a two-digit escape
        $bytes = str_split(bin2hex(self::unichr($int)), 2);

        return '\\x' . implode('\\x', $bytes);
    }

    /**
     * Converts a string encoded using UTF-16 (little endian or big endian) to UTF-8.
     *
     * Original code was written by Andrew Walker who releases this code in public domain. This function
     * was modified in Spica to adapt to more context.
     *
     * A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
     * (0xDC00-0xDFFF) is combined into one code point and written as four UTF-8
     * bytes. A surrogate without its partner is written as three bytes unchanged.
     * U+0000 is written as a single zero byte. If the input has an odd number of
     * bytes, the missing final byte is treated as 0x00.
     *
     * @see    Use mb_convert_encoding($str, "UTF-8", "UTF-16LE") or mb_convert_encoding($str, "UTF-8", "UTF-16BE") which is more efficient
     * @see    http://bugs.php.net/bug.php?id=34776
     * @see    http://en.wikipedia.org/wiki/Byte_order_mark
     * @see    http://www.onicos.com/staff/iz/amuse/javascript/expert/utf.txt JavaScript functions
     * @param  string $str
     * @param  bool $parseBOM Default value: false. Must be exactly true to enable BOM detection
     * @param  bool $be If $parseBOM = false then $be (big endian) will be used, defaults to true.
     *                  Any value other than exactly true selects little endian
     * @return string
     * @throws InvalidArgumentException when $parseBOM is true and $str does not start with a BOM
     */
    public static function utf16ToUtf8($str, $parseBOM = false, $be = true)
    {
        $str = (string) $str;

        if (true === $parseBOM)
        {
            $bom = substr($str, 0, 2);

            if ("\xFE\xFF" === $bom)
            {
                $be = true; // big endian (starts with \xFE\xFF)
            }
            else if ("\xFF\xFE" === $bom)
            {
                $be = false; // little endian (starts with \xFF\xFE)
            }
            else
            {
                throw new InvalidArgumentException('Unable to find BOM in the header of the string');
            }

            // Cast: older PHP versions return false when nothing follows the BOM
            $str = (string) substr($str, 2);
        }

        $bigEndian = (true === $be);
        $len       = strlen($str);
        $dec       = '';

        for ($i = 0; $i < $len; $i += 2)
        {
            $c = self::readUtf16Unit($str, $i, $len, $bigEndian);

            // Combine a surrogate pair, but only when a complete second unit exists
            if ($c >= 0xD800 && $c <= 0xDBFF && $i + 3 < $len)
            {
                $low = self::readUtf16Unit($str, $i + 2, $len, $bigEndian);

                if ($low >= 0xDC00 && $low <= 0xDFFF)
                {
                    $c  = 0x10000 + (($c - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2; // the low surrogate has been consumed as well
                }
            }

            $dec .= self::unichr($c);
        }

        return $dec;
    }

    /**
     * Reads the 16-bit code unit that starts at a byte offset of a UTF-16 string.
     *
     * @param  string $str       UTF-16 encoded bytes
     * @param  int    $offset    Offset of the first byte of the unit; must be below $len
     * @param  int    $len       Length of $str in bytes
     * @param  bool   $bigEndian true when the first byte is the high-order byte
     * @return int    Code unit value; a missing second byte is treated as 0x00
     */
    private static function readUtf16Unit($str, $offset, $len, $bigEndian)
    {
        $first  = ord($str[$offset]);
        $second = ($offset + 1 < $len) ? ord($str[$offset + 1]) : 0;

        return $bigEndian ? (($first << 8) | $second) : (($second << 8) | $first);
    }

    /**
     * Unicode version of chr(): encodes a code point as UTF-8.
     *
     * @see    http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
     * @param  int $c Code point
     * @return string|false UTF-8 bytes (1 to 4), or false when $c is negative or above 0x10FFFF
     */
    public static function unichr($c)
    {
        if ($c < 0 || $c > 0x10FFFF)
        {
            return false;
        }

        // 0x00-0x7F: one byte, identical to ASCII
        if ($c <= 0x7F)
        {
            return chr($c);
        }

        // 0x80-0x7FF: 110xxxxx 10xxxxxx
        if ($c <= 0x7FF)
        {
            return chr(0xC0 | ($c >> 6)) . chr(0x80 | ($c & 0x3F));
        }

        // 0x800-0xFFFF: 1110xxxx 10xxxxxx 10xxxxxx
        if ($c <= 0xFFFF)
        {
            return chr(0xE0 | ($c >> 12))
                 . chr(0x80 | (($c >> 6) & 0x3F))
                 . chr(0x80 | ($c & 0x3F));
        }

        // 0x10000-0x10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        return chr(0xF0 | ($c >> 18))
             . chr(0x80 | (($c >> 12) & 0x3F))
             . chr(0x80 | (($c >> 6) & 0x3F))
             . chr(0x80 | ($c & 0x3F));
    }

    /**
     * Reads one UTF-8 character (1 to 4 bytes) from a file pointer.
     *
     * The lead byte decides how many continuation bytes are read. All bit tests are
     * done on the numeric value of the byte (ord()), not on the string itself.
     * When an exception is thrown, the bytes read so far have already been consumed
     * from the stream.
     *
     * @see    http://wiki.tcl.tk/515
     * @see    http://en.wikipedia.org/wiki/UTF-8
     * @param  resource $handle Resource created by fopen()
     * @return string The bytes of one character, or an empty string when nothing could be read
     * @throws Exception when the lead byte is not a valid start of a 1-4 byte sequence
     *                   (0x80-0xBF or 0xF8-0xFF), when the sequence is cut short, or when
     *                   a following byte is not a continuation byte (0x80-0xBF)
     */
    public static function readUtf8Char($handle)
    {
        $ch = fread($handle, 1); // leading byte

        if (false === $ch || '' === $ch)
        {
            return ''; // end of stream (or read failure): no character available
        }

        $lead = ord($ch);

        // U+0000 - U+007F: 0xxxxxxx, single byte
        if ($lead < 0x80)
        {
            return $ch;
        }

        if (($lead & 0xE0) == 0xC0)
        {
            $trailing = 1; // U+0080 - U+07FF: 110xxxxx 10xxxxxx
        }
        elseif (($lead & 0xF0) == 0xE0)
        {
            $trailing = 2; // U+0800 - U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx
        }
        elseif (($lead & 0xF8) == 0xF0)
        {
            $trailing = 3; // U+10000 and above: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        }
        else
        {
            // 10xxxxxx is a continuation byte; 0xF8-0xFF would start a sequence
            // longer than 4 bytes
            throw new Exception("Illegal UTF-8 sequence");
        }

        // fread() may return fewer bytes than requested, so keep reading until the
        // sequence is complete or the stream has nothing more to give
        $tail = '';
        while (strlen($tail) < $trailing)
        {
            $chunk = fread($handle, $trailing - strlen($tail));

            if (false === $chunk || '' === $chunk)
            {
                break;
            }

            $tail .= $chunk;
        }

        if (strlen($tail) !== $trailing)
        {
            throw new Exception("Illegal UTF-8 sequence");
        }

        for ($i = 0; $i < $trailing; $i++)
        {
            if ((ord($tail[$i]) & 0xC0) != 0x80)
            {
                throw new Exception("Illegal UTF-8 sequence");
            }
        }

        return $ch . $tail;
    }
}

?>